# coding=utf-8
from locale import *
import sys
import datetime
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup


def get_mac_address():
    """Return this machine's MAC address as 12 lowercase hex characters."""
    import uuid
    # uuid.getnode() yields a 48-bit hardware address; render it as
    # exactly twelve zero-padded hex digits.
    return '%012x' % uuid.getnode()


def validate_mac_address():
    """Fetch the list of authorized MAC addresses from the licensing server.

    Returns:
        str: the decoded response body; the caller checks whether this
        machine's MAC address appears in it.
    """
    import urllib.request
    # Use a context manager so the HTTP response (and its socket) is
    # always closed — the original leaked the connection.
    url = 'http://amazon-ceping.xunhuanle.com/publicwelcome/getallmacaddress'
    with urllib.request.urlopen(url) as response:
        return response.read().decode()

def set_driver():
    """Start a Firefox WebDriver configured for fast, image-free crawling."""
    profile = FirefoxProfile()
    # 2 = block all images so list pages load quickly.
    profile.set_preference('permissions.default.image', 2)
    # Disable the Flash plugin as well.
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
    return webdriver.Firefox(profile)

def run_crawler(url, run_times, input_ping_lun_num, line_number, key_words, open_file):
    """Crawl Amazon search-result pages starting at *url* and record matches.

    Walks list pages via the "next page" link. For every result item whose
    text contains *key_words* (the site's "unavailable" marker) and whose
    review count is at least *input_ping_lun_num*, the product page is
    fetched; if it has no '#twister' variation form, a tab-separated record
    is appended to *open_file*.

    Args:
        url: search-results list page to start from.
        run_times: number of driver requests to make before the browser is
            restarted (and before this function recurses on the next page).
        input_ping_lun_num: minimum review count for an item to be recorded.
        line_number: identifier of the input line being processed; written
            into each output record.
        key_words: "unavailable" keyword to look for in each item's text.
        open_file: path of the output file (opened in append mode).
    """
    driver = set_driver()
    driver.get(url)
    # Seed nextPageUrl with the entry URL so the recovery path in the outer
    # `except` below cannot hit a NameError when the very first iteration
    # fails before a next-page link was located (the original bug).
    nextPageUrl = url
    i = 1
    while i < run_times:
        try:
            i += 1
            current_list_url = driver.current_url
            print('CURRENT LIST PAGE URL:' + current_list_url)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            allItems = soup.select('li.s-result-item')
            try:
                # Try to find the next-page button. If it is missing, check
                # whether we reached the last page; otherwise look for a
                # captcha challenge.
                WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.ID, 'pagnNextLink')))
                nextPageUrl = driver.find_element_by_id("pagnNextLink").get_attribute("href")
            except Exception:
                try:
                    WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CLASS_NAME, 'proceedWarning')))
                    print('This is the last page,We go to next type of link!')
                    driver.quit()
                    break
                except Exception:
                    print("can't find the last page tips!!! We try to find captchacharacters ID")
                    # Detect a captcha page. If found, restart the browser
                    # with images enabled so a human can solve it.
                    try:
                        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.ID, 'captchacharacters')))
                        valid_url = driver.current_url
                        print("VALID_URL=", valid_url)
                        driver.quit()
                        firefox_profile = FirefoxProfile()
                        # 1 = allow images so the captcha is visible.
                        firefox_profile.set_preference('permissions.default.image', 1)
                        firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'true')
                        driver = webdriver.Firefox(firefox_profile)
                        driver.get(valid_url)
                    except Exception:
                        print("Unknow Error!!!")
                        driver.quit()
                        break
            for item in allItems:
                span_name = item.select_one('span[name]')
                if span_name is not None:
                    # Check whether the item's text marks it as unavailable.
                    item_text = item.get_text()
                    if item_text is not None:
                        if key_words in item_text:
                            asin_text = span_name.get("name")
                            star_ele = span_name.select_one('i.a-icon-star > span')
                            if star_ele is not None:
                                star_text = star_ele.get_text()
                            else:
                                star_text = "Null"
                            pingLun_ele = span_name.select_one(' + a')
                            if pingLun_ele is not None:
                                pingLun_text = pingLun_ele.get_text()
                                # atof honours the locale set at startup, so
                                # counts like "1,234" parse correctly.
                                pingLunNum = int(atof(pingLun_text))
                                # Product URL lives on the <h2>'s parent link.
                                m_url = item.select_one('h2').parent.get("href")
                                # Only items with enough reviews are fetched
                                # and, if they pass validation, recorded.
                                if pingLunNum >= input_ping_lun_num:
                                    i += 1
                                    if i >= run_times:
                                        # Restart the browser periodically to
                                        # keep sessions fresh.
                                        i = 1
                                        driver.quit()
                                        driver = set_driver()
                                    driver.get(m_url.strip())
                                    soup_validate = BeautifulSoup(driver.page_source, "html.parser")
                                    form_twister = soup_validate.select_one('#twister')
                                    if form_twister is None:
                                        print('NEED URL:' + m_url + " write to file")
                                        # Context manager guarantees the file
                                        # is closed even if the write fails
                                        # (original leaked on exceptions).
                                        with open(open_file, 'a') as out_file:
                                            out_file.write(line_number + '\t' + current_list_url + '\t' + m_url + '\t' + asin_text + '\t' + star_text + '\t' + pingLun_text + '\n')
                                    else:
                                        print("NOT NEED URL:" + m_url)
                                else:
                                    print('PINGLUN NUM:' + pingLun_text + ' Comments number is too low to need!')
            if i < run_times:
                driver.get(nextPageUrl)
            else:
                # Request budget spent: restart with a fresh browser on the
                # next page via recursion.
                driver.quit()
                run_crawler(nextPageUrl, run_times, input_ping_lun_num, line_number, key_words, open_file)
        except Exception:
            print('ERROR， RERUN NEXT URL.')
            # NOTE(review): the crashed driver is not quit here, so repeated
            # failures can leak browser processes; left unchanged because the
            # loop may still reuse the driver after the recursion returns.
            run_crawler(nextPageUrl, run_times, input_ping_lun_num, line_number, key_words, open_file)






# --- License gate: compare this machine's MAC against the server list. ---
mac_address = get_mac_address()
print("Your macaddress is below:")
print(mac_address)
validation_content = validate_mac_address()
if mac_address not in validation_content:
    print("Please submit your unicode '"+mac_address+"' to administrator!!!")
    #sys.exit()

# Output file: tab-separated records, named with a startup timestamp.
file_name = "found_link_"+datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S')+".xls"

# Input file of links to crawl. Expected format per line:
# sequence number + '\t' + search keyword + '\t' + hand-picked link
file_name_input = input("Input Will Import Link File Name:")
print("Import Link File Name is:", file_name_input)

# Sequence number of the line where the previous run was interrupted.
last_broken_line_num = input("Input Last Broken Line Number(Leave Empty If Starting With Head):")
print("Last Broken Line Number(Leave Empty If Starting With Head):", last_broken_line_num)

# List-page URL where the previous run was interrupted.
last_broken_list_page_link = input("Input Last Broken List Page Link(Leave Empty If Starting With Head):")
print("Last Broken List Page Link(Leave Empty If Starting With Head):", last_broken_list_page_link)

# Keyword this country's site shows for unavailable products.
keyWords = input("Input Unavailable Keywords:").strip()
print("Unavailable Keywords is:", keyWords)

# Skip items with fewer comments than this threshold.
inputPingLunNum = input("Comments Number:").strip()
print("Comments Number is:", inputPingLunNum)

# Restart the browser after this many requests.
runTimes = input("Run Times Every Cycle:").strip()
print("Run Times Every Cycle is:", runTimes)

# Locale drives `atof` so review counts like "1,234" parse correctly.
# NOTE(review): 'English_US' is a Windows-only locale name; on POSIX
# systems this raises locale.Error ('en_US.UTF-8' would be needed).
setlocale(LC_NUMERIC, 'English_US')

# Context manager ensures the input file is closed even if crawling raises
# (the original never closed it).
with open(file_name_input, 'r') as imported_file:
    for line in imported_file:
        line_data_arr = line.split('\t')
        # Skip blank or malformed lines instead of crashing with an
        # uncaught IndexError (the split/index was outside the try below).
        if len(line_data_arr) < 3:
            continue
        crawler_url = line_data_arr[2].strip()
        line_num = line_data_arr[0].strip()
        print("LINE NUMBER: " + line_num + " " + crawler_url)

        try:
            if last_broken_line_num.strip() != '':
                # A resume point was given: skip lines already handled.
                if int(line_num) < int(last_broken_line_num):
                    print('LINE ' + line_num + 'HAD HANDLERED!!!')
                    continue
                elif int(line_num) == int(last_broken_line_num):
                    # Resume mid-keyword from the interrupted list page
                    # when one was provided, otherwise from the start.
                    if last_broken_list_page_link.strip() != '':
                        run_crawler(last_broken_list_page_link, int(runTimes), int(inputPingLunNum), line_num, keyWords, file_name)
                    else:
                        run_crawler(crawler_url, int(runTimes), int(inputPingLunNum), line_num, keyWords, file_name)
                else:
                    run_crawler(crawler_url, int(runTimes), int(inputPingLunNum), line_num, keyWords, file_name)
            else:
                run_crawler(crawler_url, int(runTimes), int(inputPingLunNum), line_num, keyWords, file_name)
        except Exception:
            # Best-effort: one bad line must not stop the whole batch.
            print('SOMETHING IS WRONG,CONTINUE!!!')
            continue






